# Imports
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
import preprocessor as p
import spacy
from bertopic import BERTopic
from emoji import demojize

# Default Plotly renderer used when figures are shown in the notebook.
pio.renderers.default = "plotly_mimetype+notebook_connected"
Información
Los datos de este usuario abarcan desde la creación de la cuenta (2011-06-15) hasta el 2023-03-01.
Lista del top 20 de hashtags más usados y su frecuencia
Code
# Top 20 most-used hashtags and their frequency.
# Each non-null cell of df['hashtags'] is split on '|' into individual
# hashtags; null cells are dropped before splitting.
hashtag_cells = df['hashtags'].dropna().to_list()

# Split every cell on the delimiter and flatten into a single list.
hashtags = [tag for cell in hashtag_cells for tag in cell.split('|')]

# Count occurrences of each hashtag.
hashtags_count = pd.Series(hashtags).value_counts()

# Keep the 20 most frequent hashtags.
top_hashtags = hashtags_count.nlargest(20)
top_hashtags
# Top 20 most-mentioned user names and their frequency.
# Each non-null cell of df['mentioned_names'] is split on '|' into
# individual names; null cells are dropped before splitting.
mention_cells = df['mentioned_names'].dropna().to_list()

# Split every cell on the delimiter and flatten into a single list.
users = [name for cell in mention_cells for name in cell.split('|')]

# Count occurrences of each mentioned name.
users_count = pd.Series(users).value_counts()

# Return the first 20 rows in descending order of frequency.
top_users = users_count.nlargest(20)
top_users
# Line chart of like_count against date; the 'text' column is added to the
# hover tooltip of each point.
fig = px.line(
    df,
    x='date',
    y='like_count',
    title='Número de likes en el tiempo',
    template='plotly_white',
    hover_data=['text'],
)

# Show the plot.
fig.show()
Tokens
Lista del top 20 de los tokens más comunes y su frecuencia
Code
# Load the spaCy model for Spanish and take its default stop-word set.
nlp = spacy.load("es_core_news_sm")
STOP_WORDS = nlp.Defaults.stop_words


def filter_stopwords(text):
    """Return *text* lower-cased with stop words and non-alphabetic tokens removed.

    The text is lower-cased and tokenized with the Spanish spaCy pipeline.
    A token is kept only if spaCy does not flag it as a stop word, its text
    is not in ``STOP_WORDS``, and it is purely alphabetic. The surviving
    tokens are joined with single spaces.
    """
    doc = nlp(text.lower())
    tokens = [
        token.text
        for token in doc
        if not token.is_stop
        and token.text not in STOP_WORDS
        and token.is_alpha
    ]
    return ' '.join(tokens)


# Store the cleaned text next to the original column.
df['preprocess'] = df['text'].apply(filter_stopwords)

# Split the cleaned text on whitespace, stack all tokens into one Series
# and keep the 20 most frequent.
token_counts = (
    df["preprocess"].str.split(expand=True).stack().value_counts().head(20)
)
token_counts
vida 2070
aborto 1097
colombia 719
sialavida 661
colombiaesprovida 437
mayo 390
q 388
noalaborto 370
eutanasia 323
derecho 323
gracias 309
provida 308
muerte 268
feliz 268
d 263
voz 250
mujer 222
familia 210
mujeres 204
concepción 191
Name: count, dtype: int64
Hora
Lista de las 10 horas con más cantidad de tweets publicados
Plataformas desde las que se publicaron contenidos y su frecuencia
Code
# Frequency of each value in the source_name column, most frequent first.
source_column = df['source_name']
source_column.value_counts()
source_name
Twitter for iPhone 2031
Twitter Web App 1706
Twitter Web Client 1487
Facebook 1468
Twitter for Android 412
Mobile Web 163
TweetDeck 133
erased88075 131
Twitter for Websites 124
Instagram 99
UberSocial for iPhone 22
Mobile Web (M2) 12
iOS 11
Twitter for Android Tablets 10
Twitter for Mac 7
Tweeet! on iOS 4
Hootsuite Inc. 3
Buffer 3
Hootsuite 2
Twibbon 1
Periscope 1
Name: count, dtype: int64